library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(corrplot) # For correlation plots
## corrplot 0.95 loaded
library(FactoMineR) # For PCA
library(factoextra) # For PCA plots
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(vegan) # For CCA
## Loading required package: permute
## Loading required package: lattice
## This is vegan 2.6-8
library(ggplot2)
library(rsample) # For sampling of data (split, analysis,..)
library(rpart)
library(rpart.plot)
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
##
## The following object is masked from 'package:dplyr':
##
## combine
##
## The following object is masked from 'package:ggplot2':
##
## margin
library(tibble)
library(tidyr)
library(gridExtra)
##
## Attaching package: 'gridExtra'
##
## The following object is masked from 'package:randomForest':
##
## combine
##
## The following object is masked from 'package:dplyr':
##
## combine
library(factoextra)
library(caret)
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:vegan':
##
## tolerance
##
## The following object is masked from 'package:purrr':
##
## lift
rm(list = ls())
path <- "/Users/eamon/Desktop/MathToolsProject/R_Project/MathematicalTools/lizard.csv"
# Read a CSV file
data <- read.csv(path)
str(data)
## 'data.frame': 740 obs. of 24 variables:
## $ Species : chr "Agama Agama" "Agama Agama" "Agama hispida" "Caimanops amphiboluroides" ...
## $ Genus : chr "Agama" "Agama" "Agama" "Caimanops" ...
## $ Family : chr "Agamidae" "Agamidae" "Agamidae" "Agamidae" ...
## $ Population : chr "Suacocco, Liberia 1952-1953" "Ibadan, NIgeria" "Kalahari Desert, Africa, 1969-1970" "Australia" ...
## $ Longitude : num -9.05 3.92 21.25 119.1 119.62 ...
## $ Latitude : num 7.08 7.4 -27.03 -28.5 -27.08 ...
## $ Average.Female.adult.weight..g.: num NA NA 31.9 NA 10 ...
## $ SD.Female.adult.weight..g. : chr NA NA "6.16" NA ...
## $ Sample.Size.Female.adult.weight: int NA NA 28 NA 17 75 NA 41 741 120 ...
## $ Mean.F.SVL.adults..mm. : num 85 97 94.8 94 62.9 ...
## $ SD.F.SVL.adults..mm. : num NA NA 6.91 7.07 4.43 ...
## $ Sample.size.Mean.F.SVL.adults : int 63 NA 51 2 18 75 NA 41 756 120 ...
## $ F.SVL.at.maturity..mm. : num 75 NA 76 89 57 44 48 49 47 51 ...
## $ Offspring.SVL..mm. : num 32 NA 29 38 22 23 24 25 22 26 ...
## $ Mean.Clutch.Size : num 7.5 5.5 13.4 12 5.36 ...
## $ Sample.size.Clutch.Size. : int NA NA 45 2 14 28 NA 13 40 39 ...
## $ Mode.of.reproduction : chr "Oviparous" "Oviparous" "Oviparous" "Oviparous" ...
## $ Clutches.per.year : num 2 NA NA NA NA NA 2 NA NA NA ...
## $ Clutch.Frequency : num 2 2 1 1 1 1 2 1 2 2 ...
## $ RCM : num NA NA 0.305 NA 0.166 ...
## $ Foraging.Mode : chr "Sit and wait" "Sit and wait" "Sit and wait" "Sit and wait" ...
## $ Distribution : chr "Tropical" "Tropical" "Temperate" "Temperate" ...
## $ Prefered.Habitat.Type : chr "Saxicolous" "Saxicolous" "Semi-arboreal" "Semi-arboreal" ...
## $ Source : chr "(Harris 1964)" "(Daniel 1960)" "(Pianka 1986)" "(Pianka 2013d)" ...
# Are clutch frequency and clutches per year the same?
cor(data$Clutch.Frequency, data$Clutches.per.year, use = "complete.obs")
## [1] 0.7994683
plot(data$Clutch.Frequency~data$Clutches.per.year)
unique(data$Clutch.Frequency)
## [1] 2.00 1.00 NA 0.50 0.75
unique(data$Clutches.per.year)
## [1] 2.00 NA 3.00 1.00 4.00 6.00 0.50 0.75 2.50 1.50 1.30 3.50 5.00 5.50
# Are "F.SVL.at.maturity..mm." and "Mean.F.SVL.adults..mm." the same?
cor(data$F.SVL.at.maturity..mm., data$Mean.F.SVL.adults..mm., use="complete.obs")
## [1] 0.974189
plot(data$F.SVL.at.maturity..mm.~data$Mean.F.SVL.adults..mm.)
Seems like Clutch Frequency and Clutches per Year are highly related,
but Clutches Per Year goes up to 6 and Clutch Frequency only ranges
between 1 and 2. Seems like “F.SVL.at.maturity..mm.” and
“Mean.F.SVL.adults..mm.” are the same. Therefore one of each pair should
be excluded.
##Initialise Settings of the Analysis
# These variables are kept as required to answer the research questions.
# This will result in the "data_filtered" and "data_clean" dataframes.
variables_to_keep <- c(
### Categorical Variables
#"Species",
#"Genus",
"Family",
#"Population",
"Mode.of.reproduction",
#"Source",
"Distribution",
"Prefered.Habitat.Type",
"Foraging.mode",
### Location
#"Longitude",
#"Latitude",
### Numerical Variables
"Average.Female.adult.weight..g.",
"Mean.F.SVL.adults..mm.",
#"F.SVL.at.maturity..mm.", # Removed as "Mean.F.SVL.adults..mm." is almost identical.
"Offspring.SVL..mm.",
"Mean.Clutch.Size",
#"Clutches.per.year",
"Clutch.Frequency",
"RCM"
### Standard Deviations and Sample Sizes
#"SD.F.SVL.adults..mm.",
#"Sample.size.Mean.F.SVL.adults",
#"SD.Female.adult.weight..g.",
#"Sample.Size.Female.adult.weight"
)
# Specify the variables to be included in the PCA
# This will be result in the "data_scaled" dataframe
variables_to_use_PCA <- c(
### Location
#"Longitude",
#"Latitude",
### Means and Numerical
"Average.Female.adult.weight..g.",
"Mean.F.SVL.adults..mm.",
#"F.SVL.at.maturity..mm.",
"Offspring.SVL..mm.",
"Mean.Clutch.Size",
"Clutches.per.year",
#"Clutch.Frequency",
"RCM"
### Standard Deviations and Sample Sizes
#"SD.Female.adult.weight..g.",
#"SD.F.SVL.adults..mm.",
#"Sample.Size.Female.adult.weight",
#"Sample.size.Mean.F.SVL.adults",
)
# Specify the Variables to Use in the Random Forests Analysis
variables_to_use_RF <- c(
### Categorical Variables
#"Species",
#"Genus",
#"Family",
#"Population",
"Mode.of.reproduction",
#"Source",
"Distribution",
"Prefered.Habitat.Type",
"Foraging.Mode",
"RCM",
### Location
#"Longitude",
#"Latitude",
### Numerical Variables
"Average.Female.adult.weight..g.",
"Mean.F.SVL.adults..mm.",
"F.SVL.at.maturity..mm.",
"Offspring.SVL..mm.",
"Mean.Clutch.Size",
"Clutches.per.year",
"Clutch.Frequency",
"RCM"
### Standard Deviations and Sample Sizes
#"SD.F.SVL.adults..mm.",
#"Sample.size.Mean.F.SVL.adults",
#"SD.Female.adult.weight..g.",
#"Sample.Size.Female.adult.weight"
)
variables_to_use_CCA <- c(
#"Distribution",
#"Foraging.Mode",
#"Mode.of.reproduction",
"Prefered.Habitat.Type",
"Family",
"Latitude"
)
# Set numeric variables as numeric
data$SD.Female.adult.weight..g. <- as.numeric(data$SD.Female.adult.weight..g.)
## Warning: NAs introduced by coercion
str(data)
## 'data.frame': 740 obs. of 24 variables:
## $ Species : chr "Agama Agama" "Agama Agama" "Agama hispida" "Caimanops amphiboluroides" ...
## $ Genus : chr "Agama" "Agama" "Agama" "Caimanops" ...
## $ Family : chr "Agamidae" "Agamidae" "Agamidae" "Agamidae" ...
## $ Population : chr "Suacocco, Liberia 1952-1953" "Ibadan, NIgeria" "Kalahari Desert, Africa, 1969-1970" "Australia" ...
## $ Longitude : num -9.05 3.92 21.25 119.1 119.62 ...
## $ Latitude : num 7.08 7.4 -27.03 -28.5 -27.08 ...
## $ Average.Female.adult.weight..g.: num NA NA 31.9 NA 10 ...
## $ SD.Female.adult.weight..g. : num NA NA 6.16 NA 2.29 ...
## $ Sample.Size.Female.adult.weight: int NA NA 28 NA 17 75 NA 41 741 120 ...
## $ Mean.F.SVL.adults..mm. : num 85 97 94.8 94 62.9 ...
## $ SD.F.SVL.adults..mm. : num NA NA 6.91 7.07 4.43 ...
## $ Sample.size.Mean.F.SVL.adults : int 63 NA 51 2 18 75 NA 41 756 120 ...
## $ F.SVL.at.maturity..mm. : num 75 NA 76 89 57 44 48 49 47 51 ...
## $ Offspring.SVL..mm. : num 32 NA 29 38 22 23 24 25 22 26 ...
## $ Mean.Clutch.Size : num 7.5 5.5 13.4 12 5.36 ...
## $ Sample.size.Clutch.Size. : int NA NA 45 2 14 28 NA 13 40 39 ...
## $ Mode.of.reproduction : chr "Oviparous" "Oviparous" "Oviparous" "Oviparous" ...
## $ Clutches.per.year : num 2 NA NA NA NA NA 2 NA NA NA ...
## $ Clutch.Frequency : num 2 2 1 1 1 1 2 1 2 2 ...
## $ RCM : num NA NA 0.305 NA 0.166 ...
## $ Foraging.Mode : chr "Sit and wait" "Sit and wait" "Sit and wait" "Sit and wait" ...
## $ Distribution : chr "Tropical" "Tropical" "Temperate" "Temperate" ...
## $ Prefered.Habitat.Type : chr "Saxicolous" "Saxicolous" "Semi-arboreal" "Semi-arboreal" ...
## $ Source : chr "(Harris 1964)" "(Daniel 1960)" "(Pianka 1986)" "(Pianka 2013d)" ...
# Convert all blank values of categorical variables to NA
data<-data %>%
mutate(across(where(is.character), ~ na_if(.x, ""))) %>%
mutate(across(where(is.character), as.factor))
str(data)
## 'data.frame': 740 obs. of 24 variables:
## $ Species : Factor w/ 337 levels "Acratosaura mentalis",..: 2 2 3 50 84 85 86 86 87 87 ...
## $ Genus : Factor w/ 137 levels "Acratosaura",..: 2 2 2 14 32 32 32 32 32 32 ...
## $ Family : Factor w/ 33 levels "Agamidae","Anguidae",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ Population : Factor w/ 288 levels "Acre, 1996","Africa-A, 1969-1970",..: 242 107 121 33 33 33 159 33 34 35 ...
## $ Longitude : num -9.05 3.92 21.25 119.1 119.62 ...
## $ Latitude : num 7.08 7.4 -27.03 -28.5 -27.08 ...
## $ Average.Female.adult.weight..g.: num NA NA 31.9 NA 10 ...
## $ SD.Female.adult.weight..g. : num NA NA 6.16 NA 2.29 ...
## $ Sample.Size.Female.adult.weight: int NA NA 28 NA 17 75 NA 41 741 120 ...
## $ Mean.F.SVL.adults..mm. : num 85 97 94.8 94 62.9 ...
## $ SD.F.SVL.adults..mm. : num NA NA 6.91 7.07 4.43 ...
## $ Sample.size.Mean.F.SVL.adults : int 63 NA 51 2 18 75 NA 41 756 120 ...
## $ F.SVL.at.maturity..mm. : num 75 NA 76 89 57 44 48 49 47 51 ...
## $ Offspring.SVL..mm. : num 32 NA 29 38 22 23 24 25 22 26 ...
## $ Mean.Clutch.Size : num 7.5 5.5 13.4 12 5.36 ...
## $ Sample.size.Clutch.Size. : int NA NA 45 2 14 28 NA 13 40 39 ...
## $ Mode.of.reproduction : Factor w/ 2 levels "Oviparous","Viviparous": 1 1 1 1 1 1 1 1 1 1 ...
## $ Clutches.per.year : num 2 NA NA NA NA NA 2 NA NA NA ...
## $ Clutch.Frequency : num 2 2 1 1 1 1 2 1 2 2 ...
## $ RCM : num NA NA 0.305 NA 0.166 ...
## $ Foraging.Mode : Factor w/ 2 levels "Active","Sit and wait": 2 2 2 2 2 2 2 2 2 2 ...
## $ Distribution : Factor w/ 2 levels "Temperate","Tropical": 2 2 1 1 1 1 1 1 1 1 ...
## $ Prefered.Habitat.Type : Factor w/ 8 levels "Aquatic","Arboreal",..: 6 6 7 7 6 8 8 8 8 8 ...
## $ Source : Factor w/ 236 levels "(Alcala and Brown 1967)",..: 73 37 121 130 236 129 33 129 119 119 ...
# Check for Duplicates
duplicates <- data[duplicated(data), ]
print("Duplicate Rows:")
## [1] "Duplicate Rows:"
print(duplicates)
## Species Genus Family
## 307 Sceloporus jarrovi Sceloporus Phrynosomatidae
## 558 Aspidoscelis exsanguis Aspidoscelis Teiidae
## 567 Aspidoscelis tesselatus Aspidoscelis Teiidae
## 712 Xantusia vigilis Xantusia Xantusiidae
## 739 <NA> <NA> <NA>
## 740 <NA> <NA> <NA>
## Population Longitude Latitude
## 307 USA, 1962-1964 -109.58 31.00
## 558 USA -104.00 30.00
## 567 USA -104.00 30.00
## 712 Antelope Valley, Mojave Desert -118.25 34.75
## 739 <NA> NA NA
## 740 <NA> NA NA
## Average.Female.adult.weight..g. SD.Female.adult.weight..g.
## 307 15.53 NA
## 558 12.65 NA
## 567 17.95 NA
## 712 NA NA
## 739 NA NA
## 740 NA NA
## Sample.Size.Female.adult.weight Mean.F.SVL.adults..mm. SD.F.SVL.adults..mm.
## 307 NA 77.90 NA
## 558 NA 74.99 NA
## 567 NA 82.70 NA
## 712 NA 41.00 NA
## 739 NA NA NA
## 740 NA NA NA
## Sample.size.Mean.F.SVL.adults F.SVL.at.maturity..mm. Offspring.SVL..mm.
## 307 NA 69 NA
## 558 NA 63 NA
## 567 NA 66 NA
## 712 NA NA NA
## 739 NA NA NA
## 740 NA NA NA
## Mean.Clutch.Size Sample.size.Clutch.Size. Mode.of.reproduction
## 307 6.750 NA Viviparous
## 558 2.868 92 Oviparous
## 567 3.240 55 Oviparous
## 712 1.790 NA Viviparous
## 739 NA NA <NA>
## 740 NA NA <NA>
## Clutches.per.year Clutch.Frequency RCM Foraging.Mode Distribution
## 307 NA 1 0.1502 Sit and wait Temperate
## 558 NA 1 0.1325 Active Temperate
## 567 NA 1 0.1261 Active Temperate
## 712 1 1 NA Sit and wait Temperate
## 739 NA NA NA <NA> <NA>
## 740 NA NA NA <NA> <NA>
## Prefered.Habitat.Type Source
## 307 Saxicolous (Tinkle 1973)
## 558 Terrestrial (Schall 1978)
## 567 Terrestrial (Schall 1978)
## 712 Terrestrial (Miller 1951, 1954)
## 739 <NA> <NA>
## 740 <NA> <NA>
# Remove duplicates
data <- data[!duplicated(data), ]
# Replace empty cells with NA for factors
data[data == ''] <- NA
# Calculate percentage of NA values for each column
na_percentage <- colSums(is.na(data)) / nrow(data) * 100
# Convert NA percentages to a data frame
na_df <- data.frame(
Column = names(na_percentage),
NA_Percentage = na_percentage
)
# Create bar plot
ggplot(na_df, aes(x = Column, y = NA_Percentage)) +
geom_bar(stat = "identity", fill = "skyblue") +
labs(
title = "Percentage of NA Values per Column",
x = "Columns",
y = "Percentage (%)"
) +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1) # Rotate x-axis labels
)
Here we removed the variables that we didn’t plan to use in any part of the analysis. This was done before filtering out NAs to avoid unneccessary data loss. All rows of the RCM column which are NA are also NA for mean female adult weight (which makes sense as they’re related).
# Select only the desired variables
data_filtered <- data %>%
select(any_of(variables_to_keep))
data_clean <- na.omit(data_filtered)
nrow(data)
## [1] 734
# Too many rows lost if all NAs removed
nrow(data_clean)
## [1] 230
# Create new dataset by selecting only variables to use and removing rows with NAs
data_PCA <- data %>%
select(any_of(variables_to_use_PCA)) %>%
na.omit()
# Create ancillary dataset with all original variables but filtered to keep only rows present in the PCA dataset
# This will be used for coloration of the PCA graphs
data_PCA_all_vars <- data %>%
filter(row_number() %in% rownames(data_PCA))
data_PCA <- data_PCA %>%
#Only numeric columns are selected
mutate_all(.funs = scale)
### CorrPlot
data_PCA %>%
cor(use="pairwise.complete.obs") %>% # Calculate the empirical correlation matrix
corrplot() # Then graph this matrix
result_pca <- PCA(data_PCA,
scale.unit = TRUE, # Option to center and scale data (useless here)
ncp = 18, # Number of components to keep (here, all)
graph = FALSE)
names(result_pca) # It's a list
## [1] "eig" "var" "ind" "svd" "call"
result_pca
## **Results for the Principal Component Analysis (PCA)**
## The analysis was performed on 155 individuals, described by 6 variables
## *The results are available in the following objects:
##
## name description
## 1 "$eig" "eigenvalues"
## 2 "$var" "results for the variables"
## 3 "$var$coord" "coord. for the variables"
## 4 "$var$cor" "correlations variables - dimensions"
## 5 "$var$cos2" "cos2 for the variables"
## 6 "$var$contrib" "contributions of the variables"
## 7 "$ind" "results for the individuals"
## 8 "$ind$coord" "coord. for the individuals"
## 9 "$ind$cos2" "cos2 for the individuals"
## 10 "$ind$contrib" "contributions of the individuals"
## 11 "$call" "summary statistics"
## 12 "$call$centre" "mean of the variables"
## 13 "$call$ecart.type" "standard error of the variables"
## 14 "$call$row.w" "weights for the individuals"
## 15 "$call$col.w" "weights for the variables"
fviz_eig(result_pca, choice = "variance")
### Correlation between Each Variable and the Principle Components
corrplot(result_pca$var$cor)
# Representation in the first principal plane
fviz_pca_var(result_pca,
axes = c(1, 2)) # Number of axes to represent
fviz_pca_var(result_pca,
axes = c(1, 3)) # Number of axes to represent
fviz_pca_var(result_pca,
axes = c(2, 3)) # Number of axes to represent
fviz_pca_var(result_pca,
axes = c(1, 4))
### Plot the Individuals with the Variable Arrows and the Categorical
Coloring
Color_label <- "Distribution"
# Plot on Seperate Plots
fviz_pca_biplot(result_pca,
axes = c(1, 2),
col.ind = data_PCA_all_vars[[Color_label]])
fviz_pca_biplot(result_pca,
axes = c(1, 3),
col.ind = data_PCA_all_vars[[Color_label]])
fviz_pca_biplot(result_pca,
axes = c(2, 3),
col.ind = data_PCA_all_vars[[Color_label]])
fviz_pca_biplot(result_pca,
axes = c(3, 4),
col.ind = data_PCA_all_vars[[Color_label]])
### Plot all four on one plot
# Generate the plots
p1 <- fviz_pca_biplot(result_pca, axes = c(1, 2), col.ind = data_PCA_all_vars[[Color_label]])
p2 <- fviz_pca_biplot(result_pca, axes = c(1, 3), col.ind = data_PCA_all_vars[[Color_label]])
p3 <- fviz_pca_biplot(result_pca, axes = c(2, 3), col.ind = data_PCA_all_vars[[Color_label]])
p4 <- fviz_pca_biplot(result_pca, axes = c(3, 4), col.ind = data_PCA_all_vars[[Color_label]])
# Arrange the plots in a grid
grid.arrange(p1, p2, p3, p4, nrow = 2, ncol = 2)
# Create Clusters For Four Groups
first_kmeans <- data_PCA %>%
kmeans(centers = 4)
# Within Group Inertia
first_kmeans$tot.withinss
## [1] 390.3572
kmeans_5groups<- data_PCA %>%
kmeans(centers = 5)
kmeans_4groups<- data_PCA %>%
kmeans(centers = 4)
kmeans_3groups<- data_PCA %>%
kmeans(centers=3)
kmeans_2groups <- data_PCA %>%
kmeans(centers=2)
# Extract the Clustering Groups
clusters_5groups <- kmeans_5groups$cluster
clusters_4groups <- kmeans_4groups$cluster
clusters_3groups <- kmeans_3groups$cluster
clusters_2groups <- kmeans_2groups$cluster
# Representation in the first principal plane
fviz_pca_biplot(result_pca,
axes = c(1, 2),
col.ind = clusters_2groups)
fviz_pca_biplot(result_pca,
axes = c(1, 3),
col.ind = clusters_2groups)
fviz_pca_biplot(result_pca,
axes = c(2, 3),
col.ind = clusters_2groups)
fviz_pca_biplot(result_pca,
axes = c(3, 4),
col.ind = clusters_2groups)
# Save each of the plots
p1A <- fviz_pca_biplot(result_pca, axes = c(1, 2), col.ind = as.factor(clusters_2groups), col.var = "black", title = "Coloration from K-Means Clustering")
p1B <- fviz_pca_biplot(result_pca, axes = c(1, 2), col.ind = data_PCA_all_vars[[Color_label]], col.var = "black", title = "Coloration from Distribution")
p2A <- fviz_pca_biplot(result_pca, axes = c(1, 3), col.ind = as.factor(clusters_2groups), col.var = "black", title = "Coloration from K-Means Clustering")
p2B <- fviz_pca_biplot(result_pca, axes = c(1, 3), col.ind = data_PCA_all_vars[[Color_label]], col.var = "black", title = "Coloration from Distribution")
p3A <- fviz_pca_biplot(result_pca, axes = c(2, 3), col.ind = as.factor(clusters_2groups), col.var = "black", title = "Coloration from K-Means Clustering")
p3B <- fviz_pca_biplot(result_pca, axes = c(2, 3), col.ind = data_PCA_all_vars[[Color_label]], col.var = "black", title = "Coloration from Distribution")
p4A <- fviz_pca_biplot(result_pca, axes = c(3, 4), col.ind = as.factor(clusters_2groups), col.var = "black", title = "Coloration from K-Means Clustering")
p4B <- fviz_pca_biplot(result_pca, axes = c(3, 4), col.ind = data_PCA_all_vars[[Color_label]], col.var = "black", title = "Coloration from Distribution")
# Plot each axis combination
grid.arrange(p1A, p1B, nrow = 1, ncol = 2)
grid.arrange(p2A, p2B, nrow = 1, ncol = 2)
grid.arrange(p3A, p3B, nrow = 1, ncol = 2)
grid.arrange(p4A, p4B, nrow = 1, ncol = 2)
# Read data
data_counts <- read.table("donnees_comptage_genre.txt",
sep = ";",
header = TRUE,
row.names = 1)
# Convert to long tibble
data_counts_long <- data_counts %>%
rownames_to_column(var = "Zone") %>% # Create Zone column from row names
# Now we transform into length by aggregating all columns except zone into
# 2 columns, one giving the Genus name, one giving the number of individuals
pivot_longer(cols = -c("Zone"), # Variable not affected by aggregation
names_to = "Genus", # Old column names will be
# brought into a "Genus" variable
values_to = "NbIndividuals") # Numbers are aggregated in a new
# column NbIndiviuds
dim(data_counts) # Old dimensions
## [1] 9 8
data_counts_long
## # A tibble: 72 × 3
## Zone Genus NbIndividuals
## <chr> <chr> <int>
## 1 Alluvial_1 Dipterocarpus 85
## 2 Alluvial_1 Eusideroxylon 204
## 3 Alluvial_1 Garcinia 5
## 4 Alluvial_1 Gluta 114
## 5 Alluvial_1 Hydnocarpus 197
## 6 Alluvial_1 Pentace 8
## 7 Alluvial_1 Shorea 342
## 8 Alluvial_1 Tristaniopsis 0
## 9 Alluvial_2 Dipterocarpus 209
## 10 Alluvial_2 Eusideroxylon 165
## # ℹ 62 more rows
# Select all four variables to be used in the CA and CCA, omit NAs
data_CCA <- data %>%
# Select only the variables to use for the CA and CCA
select(any_of(variables_to_use_CCA)) %>%
# Exclude NAs
na.omit()
# Make a frequency table containing just 2 categorical variables for the CA
# With each pair of levels on a seperate row
freq_CA_long <- data_CCA %>%
select(Family, Prefered.Habitat.Type) %>%
table() %>%
as_tibble %>%
rename(Freq = "n",
Habitat = "Prefered.Habitat.Type")
# Make a tibble with the count of each combination
freq_CA_short <- data_CCA %>%
select(Prefered.Habitat.Type, Family) %>%
count(Prefered.Habitat.Type, Family) %>%
pivot_wider(
names_from = Family,
values_from = n,
values_fill = 0)
# Convert to dataframe and create rownames (for use creating average profile)
freq_CA_short_df <- freq_CA_short %>%
as.data.frame() %>%
column_to_rownames(var = "Prefered.Habitat.Type")
freq_CA_long
## # A tibble: 264 × 3
## Family Habitat Freq
## <chr> <chr> <int>
## 1 Agamidae Aquatic 0
## 2 Anguidae Aquatic 0
## 3 Anniellidae Aquatic 0
## 4 Carphodactylidae Aquatic 0
## 5 Chamaeleonidae Aquatic 0
## 6 Cordylidae Aquatic 0
## 7 Corytophanidae Aquatic 0
## 8 Crotaphytidae Aquatic 0
## 9 Dactyloidae Aquatic 1
## 10 Diplodactylidae Aquatic 0
## # ℹ 254 more rows
freq_CA_short
## # A tibble: 8 × 34
## Prefered.Habitat.Type Dactyloidae Gymnophthalmidae Shinisauridae Teiidae
## <fct> <int> <int> <int> <int>
## 1 Aquatic 1 4 1 1
## 2 Arboreal 25 0 0 3
## 3 Bromelicolous 0 0 0 0
## 4 Fossorial 0 2 0 0
## 5 Psammophilus 0 0 0 0
## 6 Saxicolous 0 0 0 0
## 7 Semi-arboreal 0 0 0 0
## 8 Terrestrial 14 42 0 120
## # ℹ 29 more variables: Agamidae <int>, Chamaeleonidae <int>,
## # Corytophanidae <int>, Diplodactylidae <int>, Gekkonidae <int>,
## # Hoplocercidae <int>, Leiosauridae <int>, Liolaemidae <int>,
## # Opluridae <int>, Phrynosomatidae <int>, Phyllodactylidae <int>,
## # Polychrotidae <int>, Scincidae <int>, Sphaerodactylidae <int>,
## # Tropiduridae <int>, Varanidae <int>, Anguidae <int>, Anniellidae <int>,
## # Lacertidae <int>, Cordylidae <int>, Crotaphytidae <int>, Iguanidae <int>, …
freq_CA_short_df
## Dactyloidae Gymnophthalmidae Shinisauridae Teiidae Agamidae
## Aquatic 1 4 1 1 0
## Arboreal 25 0 0 3 5
## Bromelicolous 0 0 0 0 0
## Fossorial 0 2 0 0 0
## Psammophilus 0 0 0 0 0
## Saxicolous 0 0 0 0 3
## Semi-arboreal 0 0 0 0 3
## Terrestrial 14 42 0 120 16
## Chamaeleonidae Corytophanidae Diplodactylidae Gekkonidae
## Aquatic 0 0 0 0
## Arboreal 1 1 5 17
## Bromelicolous 0 0 0 0
## Fossorial 0 0 0 0
## Psammophilus 1 0 0 0
## Saxicolous 0 0 0 8
## Semi-arboreal 0 0 1 0
## Terrestrial 0 1 5 14
## Hoplocercidae Leiosauridae Liolaemidae Opluridae Phrynosomatidae
## Aquatic 0 0 0 0 0
## Arboreal 1 3 6 1 15
## Bromelicolous 0 0 0 0 0
## Fossorial 0 0 0 0 0
## Psammophilus 0 0 0 0 6
## Saxicolous 0 0 3 0 18
## Semi-arboreal 0 0 0 0 0
## Terrestrial 0 2 12 0 43
## Phyllodactylidae Polychrotidae Scincidae Sphaerodactylidae
## Aquatic 0 0 0 0
## Arboreal 5 4 16 9
## Bromelicolous 0 0 10 0
## Fossorial 0 0 9 0
## Psammophilus 0 0 1 0
## Saxicolous 10 0 4 0
## Semi-arboreal 0 0 0 2
## Terrestrial 8 0 80 8
## Tropiduridae Varanidae Anguidae Anniellidae Lacertidae Cordylidae
## Aquatic 0 0 0 0 0 0
## Arboreal 15 1 0 0 0 0
## Bromelicolous 0 0 0 0 0 0
## Fossorial 0 0 2 3 0 0
## Psammophilus 1 0 0 0 1 0
## Saxicolous 33 0 1 0 3 5
## Semi-arboreal 0 0 0 0 0 0
## Terrestrial 8 2 5 0 24 0
## Crotaphytidae Iguanidae Leiocephalidae Xenosauridae
## Aquatic 0 0 0 0
## Arboreal 0 0 0 0
## Bromelicolous 0 0 0 0
## Fossorial 0 0 0 0
## Psammophilus 0 0 0 0
## Saxicolous 1 2 6 23
## Semi-arboreal 0 0 0 0
## Terrestrial 10 3 2 0
## Carphodactylidae Eublepharidae Helodermatidae Pygopodidae
## Aquatic 0 0 0 0
## Arboreal 0 0 0 0
## Bromelicolous 0 0 0 0
## Fossorial 0 0 0 0
## Psammophilus 0 0 0 0
## Saxicolous 0 0 0 0
## Semi-arboreal 0 0 0 0
## Terrestrial 2 1 1 3
## Xantusiidae
## Aquatic 0
## Arboreal 0
## Bromelicolous 0
## Fossorial 0
## Psammophilus 0
## Saxicolous 0
## Semi-arboreal 0
## Terrestrial 5
ggplot(freq_CA_long) +
aes(x = Family, y = Habitat, fill = Freq) + # fill says which column is used
# for filling
geom_raster() + # represents in raster mode
# and dress to make it pretty
scale_fill_viridis_c() + # Change color scale
theme(axis.text.x = element_text(angle = 90, hjust = 1))
### Calculate the Average Profile
# Create average profile
average_profile <- colSums(freq_CA_short_df) / sum(freq_CA_short_df)
average_profile
## Dactyloidae Gymnophthalmidae Shinisauridae Teiidae
## 0.054570259 0.065484311 0.001364256 0.169167804
## Agamidae Chamaeleonidae Corytophanidae Diplodactylidae
## 0.036834925 0.002728513 0.002728513 0.015006821
## Gekkonidae Hoplocercidae Leiosauridae Liolaemidae
## 0.053206003 0.001364256 0.006821282 0.028649386
## Opluridae Phrynosomatidae Phyllodactylidae Polychrotidae
## 0.001364256 0.111869031 0.031377899 0.005457026
## Scincidae Sphaerodactylidae Tropiduridae Varanidae
## 0.163710778 0.025920873 0.077762619 0.004092769
## Anguidae Anniellidae Lacertidae Cordylidae
## 0.010914052 0.004092769 0.038199181 0.006821282
## Crotaphytidae Iguanidae Leiocephalidae Xenosauridae
## 0.015006821 0.006821282 0.010914052 0.031377899
## Carphodactylidae Eublepharidae Helodermatidae Pygopodidae
## 0.002728513 0.001364256 0.001364256 0.004092769
## Xantusiidae
## 0.006821282
# Create the Count
count_habitat <- rowSums(freq_CA_short_df)
count_habitat
## Aquatic Arboreal Bromelicolous Fossorial Psammophilus
## 7 133 10 16 10
## Saxicolous Semi-arboreal Terrestrial
## 120 6 431
# Division Counts
CA_repartition <- (freq_CA_short_df / count_habitat) %>%
rbind(Average_profile = average_profile)
CA_repartition
## Dactyloidae Gymnophthalmidae Shinisauridae Teiidae
## Aquatic 0.14285714 0.57142857 0.142857143 0.14285714
## Arboreal 0.18796992 0.00000000 0.000000000 0.02255639
## Bromelicolous 0.00000000 0.00000000 0.000000000 0.00000000
## Fossorial 0.00000000 0.12500000 0.000000000 0.00000000
## Psammophilus 0.00000000 0.00000000 0.000000000 0.00000000
## Saxicolous 0.00000000 0.00000000 0.000000000 0.00000000
## Semi-arboreal 0.00000000 0.00000000 0.000000000 0.00000000
## Terrestrial 0.03248260 0.09744780 0.000000000 0.27842227
## Average_profile 0.05457026 0.06548431 0.001364256 0.16916780
## Agamidae Chamaeleonidae Corytophanidae Diplodactylidae
## Aquatic 0.00000000 0.000000000 0.000000000 0.00000000
## Arboreal 0.03759398 0.007518797 0.007518797 0.03759398
## Bromelicolous 0.00000000 0.000000000 0.000000000 0.00000000
## Fossorial 0.00000000 0.000000000 0.000000000 0.00000000
## Psammophilus 0.00000000 0.100000000 0.000000000 0.00000000
## Saxicolous 0.02500000 0.000000000 0.000000000 0.00000000
## Semi-arboreal 0.50000000 0.000000000 0.000000000 0.16666667
## Terrestrial 0.03712297 0.000000000 0.002320186 0.01160093
## Average_profile 0.03683492 0.002728513 0.002728513 0.01500682
## Gekkonidae Hoplocercidae Leiosauridae Liolaemidae Opluridae
## Aquatic 0.00000000 0.000000000 0.000000000 0.00000000 0.000000000
## Arboreal 0.12781955 0.007518797 0.022556391 0.04511278 0.007518797
## Bromelicolous 0.00000000 0.000000000 0.000000000 0.00000000 0.000000000
## Fossorial 0.00000000 0.000000000 0.000000000 0.00000000 0.000000000
## Psammophilus 0.00000000 0.000000000 0.000000000 0.00000000 0.000000000
## Saxicolous 0.06666667 0.000000000 0.000000000 0.02500000 0.000000000
## Semi-arboreal 0.00000000 0.000000000 0.000000000 0.00000000 0.000000000
## Terrestrial 0.03248260 0.000000000 0.004640371 0.02784223 0.000000000
## Average_profile 0.05320600 0.001364256 0.006821282 0.02864939 0.001364256
## Phrynosomatidae Phyllodactylidae Polychrotidae Scincidae
## Aquatic 0.00000000 0.00000000 0.000000000 0.00000000
## Arboreal 0.11278195 0.03759398 0.030075188 0.12030075
## Bromelicolous 0.00000000 0.00000000 0.000000000 1.00000000
## Fossorial 0.00000000 0.00000000 0.000000000 0.56250000
## Psammophilus 0.60000000 0.00000000 0.000000000 0.10000000
## Saxicolous 0.15000000 0.08333333 0.000000000 0.03333333
## Semi-arboreal 0.00000000 0.00000000 0.000000000 0.00000000
## Terrestrial 0.09976798 0.01856148 0.000000000 0.18561485
## Average_profile 0.11186903 0.03137790 0.005457026 0.16371078
## Sphaerodactylidae Tropiduridae Varanidae Anguidae
## Aquatic 0.00000000 0.00000000 0.000000000 0.000000000
## Arboreal 0.06766917 0.11278195 0.007518797 0.000000000
## Bromelicolous 0.00000000 0.00000000 0.000000000 0.000000000
## Fossorial 0.00000000 0.00000000 0.000000000 0.125000000
## Psammophilus 0.00000000 0.10000000 0.000000000 0.000000000
## Saxicolous 0.00000000 0.27500000 0.000000000 0.008333333
## Semi-arboreal 0.33333333 0.00000000 0.000000000 0.000000000
## Terrestrial 0.01856148 0.01856148 0.004640371 0.011600928
## Average_profile 0.02592087 0.07776262 0.004092769 0.010914052
## Anniellidae Lacertidae Cordylidae Crotaphytidae Iguanidae
## Aquatic 0.000000000 0.00000000 0.000000000 0.000000000 0.000000000
## Arboreal 0.000000000 0.00000000 0.000000000 0.000000000 0.000000000
## Bromelicolous 0.000000000 0.00000000 0.000000000 0.000000000 0.000000000
## Fossorial 0.187500000 0.00000000 0.000000000 0.000000000 0.000000000
## Psammophilus 0.000000000 0.10000000 0.000000000 0.000000000 0.000000000
## Saxicolous 0.000000000 0.02500000 0.041666667 0.008333333 0.016666667
## Semi-arboreal 0.000000000 0.00000000 0.000000000 0.000000000 0.000000000
## Terrestrial 0.000000000 0.05568445 0.000000000 0.023201856 0.006960557
## Average_profile 0.004092769 0.03819918 0.006821282 0.015006821 0.006821282
## Leiocephalidae Xenosauridae Carphodactylidae Eublepharidae
## Aquatic 0.000000000 0.0000000 0.000000000 0.000000000
## Arboreal 0.000000000 0.0000000 0.000000000 0.000000000
## Bromelicolous 0.000000000 0.0000000 0.000000000 0.000000000
## Fossorial 0.000000000 0.0000000 0.000000000 0.000000000
## Psammophilus 0.000000000 0.0000000 0.000000000 0.000000000
## Saxicolous 0.050000000 0.1916667 0.000000000 0.000000000
## Semi-arboreal 0.000000000 0.0000000 0.000000000 0.000000000
## Terrestrial 0.004640371 0.0000000 0.004640371 0.002320186
## Average_profile 0.010914052 0.0313779 0.002728513 0.001364256
## Helodermatidae Pygopodidae Xantusiidae
## Aquatic 0.000000000 0.000000000 0.000000000
## Arboreal 0.000000000 0.000000000 0.000000000
## Bromelicolous 0.000000000 0.000000000 0.000000000
## Fossorial 0.000000000 0.000000000 0.000000000
## Psammophilus 0.000000000 0.000000000 0.000000000
## Saxicolous 0.000000000 0.000000000 0.000000000
## Semi-arboreal 0.000000000 0.000000000 0.000000000
## Terrestrial 0.002320186 0.006960557 0.011600928
## Average_profile 0.001364256 0.004092769 0.006821282
ggplot(freq_CA_long) +
aes(x = Family, y = Habitat, fill = Freq) + # fill says which column is used
# for filling
geom_raster() + # represents in raster mode
# and dress to make it pretty
scale_fill_viridis_c() + # Change color scale
theme(axis.text.x = element_text(angle = 90, hjust = 1))
### CA
#
unique(data$Family)
## [1] Agamidae Anguidae Anniellidae Carphodactylidae
## [5] Chamaeleonidae Cordylidae Corytophanidae Crotaphytidae
## [9] Dactyloidae Diplodactylidae Eublepharidae Gekkonidae
## [13] Gymnophthalmidae Helodermatidae Hoplocercidae Iguanidae
## [17] Lacertidae Leiocephalidae Leiosauridae Liolaemidae
## [21] Opluridae Phrynosomatidae Phyllodactylidae Polychrotidae
## [25] Pygopodidae Scincidae Shinisauridae Sphaerodactylidae
## [29] Teiidae Tropiduridae Varanidae Xantusiidae
## [33] Xenosauridae <NA>
## 33 Levels: Agamidae Anguidae Anniellidae Carphodactylidae ... Xenosauridae
result_CA <- CA(freq_CA_short_df, graph = FALSE)
fviz_eig(result_CA)
fviz_ca_biplot(result_CA, repel = TRUE)
mean_latitude_df <- data_CCA %>%
group_by(Prefered.Habitat.Type) %>%
summarise(mean_latitude_abs = mean(abs(Latitude), na.rm = TRUE)) %>%
as.data.frame()
mean_latitude_df
## Prefered.Habitat.Type mean_latitude_abs
## 1 Aquatic 9.64369
## 2 Arboreal 15.16887
## 3 Bromelicolous 18.77975
## 4 Fossorial 28.29688
## 5 Psammophilus 29.26900
## 6 Saxicolous 18.32790
## 7 Semi-arboreal 21.67167
## 8 Terrestrial 20.77065
# Make a frequency table containing all 4 categorical variables to be used
result_CCA <- cca(freq_CA_short_df ~ mean_latitude_abs,
data = mean_latitude_df)
plot(result_CCA)
scores_CCA = scores(result_CCA)
ordiArrowMul(scores_CCA$biplot)
## [1] 3.722339
anova(result_CCA)
## Permutation test for cca under reduced model
## Permutation: free
## Number of permutations: 999
##
## Model: cca(formula = freq_CA_short_df ~ mean_latitude_abs, data = mean_latitude_df)
## Df ChiSquare F Pr(>F)
## Model 1 0.23944 1.2409 0.788
## Residual 6 1.15773
anova(result_CCA,by="margin")
## Permutation test for cca under reduced model
## Marginal effects of terms
## Permutation: free
## Number of permutations: 999
##
## Model: cca(formula = freq_CA_short_df ~ mean_latitude_abs, data = mean_latitude_df)
## Df ChiSquare F Pr(>F)
## mean_latitude_abs 1 0.23944 1.2409 0.753
## Residual 6 1.15773
##Split Data ###Filter Data Variables
data_RF <- data_clean %>%
select(any_of(variables_to_use_RF))
str(data_RF)
## 'data.frame': 230 obs. of 9 variables:
## $ Mode.of.reproduction : Factor w/ 2 levels "Oviparous","Viviparous": 1 1 1 1 1 1 1 1 1 1 ...
## $ Distribution : Factor w/ 2 levels "Temperate","Tropical": 1 1 1 1 1 1 1 1 1 1 ...
## $ Prefered.Habitat.Type : Factor w/ 8 levels "Aquatic","Arboreal",..: 7 6 8 8 8 8 8 8 8 8 ...
## $ RCM : num 0.305 0.166 0.192 0.114 0.143 ...
## $ Average.Female.adult.weight..g.: num 31.86 10.04 3.58 4.74 6.78 ...
## $ Mean.F.SVL.adults..mm. : num 94.8 62.9 48 52.6 56 ...
## $ Offspring.SVL..mm. : num 29 22 23 25 22 26 24 22 25 31 ...
## $ Mean.Clutch.Size : num 13.4 5.36 2.71 2.31 3.4 ...
## $ Clutch.Frequency : num 1 1 1 1 2 2 2 2 1 1 ...
## - attr(*, "na.action")= 'omit' Named int [1:504] 1 2 4 7 11 12 15 17 21 22 ...
## ..- attr(*, "names")= chr [1:504] "1" "2" "4" "7" ...
split_data <- data_RF %>%
initial_split(prop = 0.8, strata = "Distribution")
# Train and Test Data
train_data <- analysis(split_data)
test_data <- assessment(split_data)
forest <- randomForest(Distribution ~ ., # Formula for prediction
data = train_data, # Data for training
ntree = 5000, # Number of trees
maxnodes = 10, # Number of maximum leaves for each tree
mtry = 4, # Number of variables for each tree
importance = TRUE) # Computation of importance
# Fast way of computing accuracy with the pipe operator %>%
predict(forest, newdata = select(test_data, - Distribution)) %>%
table(prediction = ., truth = test_data$Distribution) %>%
{sum(diag(.)) / sum(.)}
## [1] 0.7446809
varImpPlot(forest)
### Hyper Parameter Tuning
# Initialisation
params_CV <- trainControl(method = "repeatedcv",
number = 10,
repeats = 5)
predictor_forest <- caret::train(Distribution ~ .,
data = data_RF,
method = "rf",
metric = "Accuracy",
trControl = params_CV,
tuneGrid = expand.grid(mtry = 2:12))
as.data.frame(predictor_forest$results) %>%
ggplot(aes(x = mtry, y = Accuracy)) +
geom_point()